## pval_cutoff: 0.05
## lfc_cutoff: 0.585
## low_counts_cutoff: 10
General statistics
# Number of samples (one column per sample)
# ncol() states the intent directly and stays correct if counts_data is a
# matrix, where length() would return rows * columns instead.
ncol(counts_data)
## [1] 6
# Number of genes (one row per gene)
nrow(counts_data)
## [1] 43432
# Total counts per sample
colSums(counts_data)
## SRR15202006 SRR15202007 SRR15202008 SRR15202009 SRR15202010 SRR15202011
## 12633373 11169811 11283591 10889859 11241553 8610959

Create DDS objects
# Create DESeqDataSet object
# get_DESeqDataSet_obj() is a project helper defined outside this view; it is
# called with the count table and the design formula ~ treatment.
# NOTE(review): the messages printed below go from 43432 to 17151 rows, so the
# helper presumably filters low-count genes (low_counts_cutoff?) - confirm.
dds <- get_DESeqDataSet_obj(counts_data, ~ treatment)
## [1] TRUE
## [1] TRUE
## [1] "DESeqDataSet object of length 43432 with 0 metadata columns"
## [1] "DESeqDataSet object of length 17151 with 1 metadata column"
# Sample annotation attached to the dataset
colData(dds)
## DataFrame with 6 rows and 2 columns
## treatment label
## <factor> <character>
## SRR15202006 control control
## SRR15202007 control control
## SRR15202008 control control
## SRR15202009 diabetes diabetes
## SRR15202010 diabetes diabetes
## SRR15202011 diabetes diabetes
Sample-to-sample comparisons
# Transform data (blinded rlog)
# get_transformed_data() is a project helper defined outside this view; its
# result is accessed later in this report through $pca and $matrix.
rld <- get_transformed_data(dds)
PCA plot
# Principal components stored alongside the transformed data
pca <- rld$pca
# Sample annotation (row names moved into a 'name' column) bound column-wise
# to the per-sample component scores
pca_df <- colData(dds) %>%
  as.data.frame() %>%
  rownames_to_column(var = 'name') %>%
  cbind(pca$x)
# Importance of each component
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 7.7864 6.3073 4.4317 3.8287 3.39901 5.433e-14
## Proportion of Variance 0.4145 0.2720 0.1343 0.1002 0.07899 0.000e+00
## Cumulative Proportion 0.4145 0.6865 0.8208 0.9210 1.00000 1.000e+00
# Scatter plot of PC1 vs PC2, coloured by sample label and annotated with the
# sample names
ggplot(pca_df, aes(x = PC1, y = PC2, color = label)) +
  geom_point() +
  # Labels are nudged below their points; FALSE is spelled out because the
  # shorthand F is an ordinary variable that can be reassigned.
  geom_text(
    aes(label = name),
    position = position_nudge(y = -2),
    show.legend = FALSE,
    size = 3
  ) +
  scale_color_manual(values = colors_default) +
  # Extra horizontal padding (presumably so labels near the edges fit)
  scale_x_continuous(expand = c(0.2, 0))

Correlation heatmap
# Heatmap of pairwise sample correlations computed on the transformed matrix,
# with each sample annotated by its label
pheatmap(
  mat = cor(rld$matrix),
  color = brewer.pal(8, 'YlOrRd'),
  annotation_col = colData(dds) %>% as.data.frame() %>% select(label)
)

Wald test results
# DE analysis using Wald test
# DESeq() is run on the dataset built above and its result kept as dds_full
dds_full <- DESeq(dds)
# Sample annotation after fitting (the output below gains a sizeFactor column)
colData(dds_full)
## DataFrame with 6 rows and 3 columns
## treatment label sizeFactor
## <factor> <character> <numeric>
## SRR15202006 control control 1.204378
## SRR15202007 control control 1.032141
## SRR15202008 control control 1.025126
## SRR15202009 diabetes diabetes 1.016402
## SRR15202010 diabetes diabetes 1.027306
## SRR15202011 diabetes diabetes 0.762996
# Wald test results
# contrast is given as (factor, numerator level, reference level); condition
# and control are defined outside this view (the output below reports
# "diabetes vs control").
# alpha is the adjusted p-value cutoff the results table is built for.
res <- results(
dds_full,
contrast = c('treatment', condition, control),
alpha = pval_cutoff
)
# Print the results table
res
## log2 fold change (MLE): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 17151 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951 2.95090 0.2797033 1.194292 0.234200 0.8148297 NA
## ENSMUSG00000025900 7.97288 -2.1413277 3.306479 -0.647616 0.5172335 0.954317
## ENSMUSG00000033845 601.46353 -0.0187531 0.131829 -0.142253 0.8868800 0.997519
## ENSMUSG00000102275 11.37339 0.1271374 0.616188 0.206329 0.8365339 0.995909
## ENSMUSG00000025903 784.45488 -0.3325434 0.136983 -2.427628 0.0151979 0.239833
## ... ... ... ... ... ... ...
## ENSMUSG00000099876 42.68208 0.204127 0.345933 0.590077 0.5551392 0.959866
## ENSMUSG00000068457 262.27768 -0.402218 0.195007 -2.062587 0.0391519 0.408271
## ENSMUSG00000069045 765.28122 -0.194585 0.123372 -1.577224 0.1147439 0.644345
## ENSMUSG00000101059 4.62892 -0.766518 1.020491 -0.751127 0.4525764 NA
## ENSMUSG00000096768 223.28712 -0.472724 0.694267 -0.680897 0.4959370 0.952825
# Type and description of each column in the results table
mcols(res)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized c..
## log2FoldChange results log2 fold change (ML..
## lfcSE results standard error: trea..
## stat results Wald statistic: trea..
## pvalue results Wald test p-value: t..
## padj results BH adjusted p-values
# Counts of up/down-regulated genes, outliers and low-count genes
summary(res)
##
## out of 17151 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 220, 1.3%
## LFC < 0 (down) : 166, 0.97%
## outliers [1] : 5, 0.029%
## low counts [2] : 2328, 14%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Plot the dispersion estimates of the fitted model
plotDispEsts(dds_full)

Summary details
# Upregulated genes (LFC > 0)
# res_sig_df is defined outside this view
filter(res_sig_df, log2FoldChange > 0)
# Downregulated genes (LFC < 0)
filter(res_sig_df, log2FoldChange < 0)
# Outliers (pvalue and padj are NA)
# is.na() never yields NA, so the logical mask can index the rows directly
res[is.na(res$pvalue), ]
## log2 fold change (MLE): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 5 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000058207 13.5434 -6.11313 3.35524 -1.821964 NA NA
## ENSMUSG00000027360 43.9822 2.26098 1.47672 1.531080 NA NA
## ENSMUSG00000029368 113.3851 -6.56226 2.43612 -2.693731 NA NA
## ENSMUSG00000030324 111.9962 -1.67520 1.72582 -0.970664 NA NA
## ENSMUSG00000034837 44.1043 -1.47704 1.66797 -0.885530 NA NA
# Low counts (only padj is NA)
# Rows that have a p-value but no adjusted p-value
res[!is.na(res$pvalue) & is.na(res$padj), ]
## log2 fold change (MLE): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 2328 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951 2.95090 0.2797033 1.19429 0.2342000 0.8148297 NA
## ENSMUSG00000062588 1.55596 -0.4198212 1.65686 -0.2533830 0.7999723 NA
## ENSMUSG00000097797 2.82657 0.0206645 1.20774 0.0171101 0.9863488 NA
## ENSMUSG00000076135 1.71889 -1.2624674 1.72109 -0.7335278 0.4632366 NA
## ENSMUSG00000079671 2.75022 3.0156568 1.49177 2.0215301 0.0432249 NA
## ... ... ... ... ... ... ...
## ENSMUSG00000084920 3.39025 -0.1024447 1.16976 -0.0875778 0.930212 NA
## ENSMUSG00000084806 2.03762 -0.2693107 1.41819 -0.1898976 0.849389 NA
## ENSMUSG00000049176 1.85195 -0.0168883 1.45129 -0.0116368 0.990715 NA
## ENSMUSG00000087159 3.24779 -0.2997782 1.22270 -0.2451773 0.806319 NA
## ENSMUSG00000101059 4.62892 -0.7665180 1.02049 -0.7511268 0.452576 NA
Shrunken LFC results
# MA plot of the unshrunken (MLE) estimates in res
plotMA(res)

# Shrunken LFC results
# Pass the existing results table so pvalue/padj are carried over from `res`
# (built with alpha = pval_cutoff). Without `res`, lfcShrink() generates its
# own table internally at the default alpha, so independent filtering and the
# padj column can differ from the unshrunken table.
# The coefficient name is assembled from condition and control, which are
# defined outside this view.
res_shrunken <- lfcShrink(
  dds_full,
  coef = str_c('treatment_', condition, '_vs_', control),
  res = res,
  type = 'apeglm'
)
# Print the shrunken results table
res_shrunken
## log2 fold change (MAP): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 17151 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951 2.95090 0.00187171 0.0971667 0.8148297 NA
## ENSMUSG00000025900 7.97288 -0.00158854 0.0974560 0.5172335 NA
## ENSMUSG00000033845 601.46353 -0.00604166 0.0785064 0.8868800 0.995641
## ENSMUSG00000102275 11.37339 0.00313091 0.0963428 0.8365339 0.990912
## ENSMUSG00000025903 784.45488 -0.20689645 0.1565421 0.0151979 0.226348
## ... ... ... ... ... ...
## ENSMUSG00000099876 42.68208 0.01527422 0.0954221 0.5551392 0.944317
## ENSMUSG00000068457 262.27768 -0.12770473 0.1809914 0.0391519 0.385684
## ENSMUSG00000069045 765.28122 -0.09192614 0.1033071 0.1147439 0.618083
## ENSMUSG00000101059 4.62892 -0.00696389 0.0973919 0.4525764 NA
## ENSMUSG00000096768 223.28712 -0.00906695 0.0971539 0.4959370 0.935226
# MA plot of the shrunken estimates
plotMA(res_shrunken)

# Type and description of each column in the shrunken results table
mcols(res_shrunken)
## DataFrame with 5 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized c..
## log2FoldChange results log2 fold change (MA..
## lfcSE results posterior SD: treatm..
## pvalue results Wald test p-value: t..
## padj results BH adjusted p-values
# Summary counts, with alpha given explicitly as pval_cutoff
summary(res_shrunken, alpha = pval_cutoff)
##
## out of 17151 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 219, 1.3%
## LFC < 0 (down) : 167, 0.97%
## outliers [1] : 5, 0.029%
## low counts [2] : 3325, 19%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Summary details
# Upregulated genes (LFC > 0)
# res_shrunken_sig_df is defined outside this view
filter(res_shrunken_sig_df, log2FoldChange > 0)
# Downregulated genes (LFC < 0)
filter(res_shrunken_sig_df, log2FoldChange < 0)
# Outliers (pvalue and padj are NA)
# is.na() never yields NA, so the logical mask can index the rows directly
res_shrunken[is.na(res_shrunken$pvalue), ]
## log2 fold change (MAP): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 5 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000058207 13.5434 -0.00268899 0.0975156 NA NA
## ENSMUSG00000027360 43.9822 0.00841953 0.0978813 NA NA
## ENSMUSG00000029368 113.3851 -0.00476900 0.0976345 NA NA
## ENSMUSG00000030324 111.9962 -0.00479276 0.0975251 NA NA
## ENSMUSG00000034837 44.1043 -0.00464703 0.0974973 NA NA
# Low counts (only padj is NA)
# Rows that have a p-value but no adjusted p-value
res_shrunken[!is.na(res_shrunken$pvalue) & is.na(res_shrunken$padj), ]
## log2 fold change (MAP): treatment diabetes vs control
## Wald test p-value: treatment diabetes vs control
## DataFrame with 3325 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951 2.95090 0.00187171 0.0971667 0.814830 NA
## ENSMUSG00000025900 7.97288 -0.00158854 0.0974560 0.517234 NA
## ENSMUSG00000062588 1.55596 -0.00146117 0.0973093 0.799972 NA
## ENSMUSG00000102135 6.74677 -0.00634244 0.0970827 0.579525 NA
## ENSMUSG00000103509 6.69317 -0.00234133 0.0969430 0.818834 NA
## ... ... ... ... ... ...
## ENSMUSG00000044583 7.56298 2.14353e-02 0.1002740 0.0729896 NA
## ENSMUSG00000049176 1.85195 -6.65622e-05 0.0972452 0.9907154 NA
## ENSMUSG00000087159 3.24779 -1.86927e-03 0.0971818 0.8063191 NA
## ENSMUSG00000072844 5.19664 -1.02531e-02 0.0978068 0.2827699 NA
## ENSMUSG00000101059 4.62892 -6.96389e-03 0.0973919 0.4525764 NA
Visualizing results
Heatmaps
# Plot normalized counts (z-scores)
# Rows are clustered and scaled per row (scale = 'row'); row names are hidden.
# NOTE(review): columns 2:7 are presumably the six sample columns - confirm.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
pheatmap(counts_sig_norm[2:7],
  color = brewer.pal(8, 'YlOrRd'),
  cluster_rows = TRUE,
  show_rownames = FALSE,
  annotation_col = as.data.frame(colData(dds)) %>% select(label),
  border_color = NA,
  fontsize = 10,
  scale = 'row',
  fontsize_row = 10,
  height = 20)

# Plot log-transformed counts
# Rows are clustered without row scaling; row names are hidden.
# NOTE(review): columns 2:7 are presumably the six sample columns - confirm.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
pheatmap(counts_sig_log[2:7],
  color = rev(brewer.pal(8, 'RdYlBu')),
  cluster_rows = TRUE,
  show_rownames = FALSE,
  annotation_col = as.data.frame(colData(dds)) %>% select(label),
  border_color = NA,
  fontsize = 10,
  fontsize_row = 10,
  height = 20)

# Plot log-transformed counts (top 24 DE genes)
# Keeps the genes whose Ensembl ID is among the first 24 rows of res_sig_df and
# uses the MGI symbol as row name.
# NOTE(review): "top" relies on the row order of res_sig_df - confirm it is
# sorted by significance where it is built.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
pheatmap(
  counts_sig_log %>%
    filter(ensembl_gene_id %in% head(res_sig_df, 24)$ensembl_gene_id) %>%
    select(-ensembl_gene_id) %>%
    column_to_rownames(var = 'mgi_symbol'),
  color = rev(brewer.pal(8, 'RdYlBu')),
  cluster_rows = TRUE,
  show_rownames = TRUE,
  annotation_col = as.data.frame(colData(dds)) %>% select(label),
  fontsize = 10,
  fontsize_row = 10,
  height = 20)

Volcano plots
# Unshrunken LFC
# Each gene is classed as DE-up, DE-down or non-DE from its adjusted p-value
# and absolute fold change; rows whose class is NA are dropped before plotting.
res_df %>%
  mutate(
    sig_threshold = if_else(
      padj < pval_cutoff & abs(log2FoldChange) >= lfc_cutoff,
      if_else(log2FoldChange > 0, 'DE-up', 'DE-down'),
      'non-DE'
    )
  ) %>%
  filter(!is.na(sig_threshold)) %>%
  ggplot() +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = sig_threshold)) +
  # Named values pin each class to its colour; unnamed values are assigned by
  # position and would shift whenever one class is absent from the data.
  scale_color_manual(
    values = c('DE-down' = 'blue', 'DE-up' = 'red', 'non-DE' = 'gray')
  ) +
  xlab('log2 fold change') +
  ylab('-log10 adjusted p-value')

# Shrunken LFC
# Each gene is classed as DE-up, DE-down or non-DE from its adjusted p-value
# and absolute fold change; rows whose class is NA are dropped before plotting.
res_shrunken_df %>%
  mutate(
    sig_threshold = if_else(
      padj < pval_cutoff & abs(log2FoldChange) >= lfc_cutoff,
      if_else(log2FoldChange > 0, 'DE-up', 'DE-down'),
      'non-DE'
    )
  ) %>%
  filter(!is.na(sig_threshold)) %>%
  ggplot() +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = sig_threshold)) +
  # Named values pin each class to its colour; unnamed values are assigned by
  # position and would shift whenever one class is absent from the data.
  scale_color_manual(
    values = c('DE-down' = 'blue', 'DE-up' = 'red', 'non-DE' = 'gray')
  ) +
  xlab('log2 fold change') +
  ylab('-log10 adjusted p-value')

GSEA (all)
Hallmark genesets
# get_fgsea_res() and plot_enrichment_table() are project helpers defined
# outside this view; each call passes the same ranking and the mm_h gene sets
# to both.
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_h) %>% plot_enrichment_table(rank_lfc, mm_h)

# Wald stat
get_fgsea_res(rank_stat, mm_h) %>% plot_enrichment_table(rank_stat, mm_h)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_h) %>% plot_enrichment_table(rank_pval, mm_h)

GO biological process
# Same three rankings, run against the mm_c5_bp gene sets (helpers and gene
# sets are defined outside this view).
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_bp) %>% plot_enrichment_table(rank_lfc, mm_c5_bp)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_bp) %>% plot_enrichment_table(rank_stat, mm_c5_bp)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_bp) %>% plot_enrichment_table(rank_pval, mm_c5_bp)

GO cellular component
# Same three rankings, run against the mm_c5_cc gene sets (helpers and gene
# sets are defined outside this view).
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_cc) %>% plot_enrichment_table(rank_lfc, mm_c5_cc)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_cc) %>% plot_enrichment_table(rank_stat, mm_c5_cc)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_cc) %>% plot_enrichment_table(rank_pval, mm_c5_cc)

GO molecular function
# Same three rankings, run against the mm_c5_mf gene sets (helpers and gene
# sets are defined outside this view).
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_mf) %>% plot_enrichment_table(rank_lfc, mm_c5_mf)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_mf) %>% plot_enrichment_table(rank_stat, mm_c5_mf)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_mf) %>% plot_enrichment_table(rank_pval, mm_c5_mf)

GSEA (DE)
Hallmark genesets
# NOTE(review): these calls are textually identical to the Hallmark calls in
# the "GSEA (all)" section; presumably the rank_* vectors are re-derived from
# DE genes only in code not shown here - confirm.
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_h) %>% plot_enrichment_table(rank_lfc, mm_h)

# Wald stat
get_fgsea_res(rank_stat, mm_h) %>% plot_enrichment_table(rank_stat, mm_h)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_h) %>% plot_enrichment_table(rank_pval, mm_h)

GO biological process
# NOTE(review): textually identical to the mm_c5_bp calls in the "GSEA (all)"
# section; presumably rank_* now hold DE genes only - confirm.
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_bp) %>% plot_enrichment_table(rank_lfc, mm_c5_bp)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_bp) %>% plot_enrichment_table(rank_stat, mm_c5_bp)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_bp) %>% plot_enrichment_table(rank_pval, mm_c5_bp)

GO cellular component
# NOTE(review): textually identical to the mm_c5_cc calls in the "GSEA (all)"
# section; presumably rank_* now hold DE genes only - confirm.
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_cc) %>% plot_enrichment_table(rank_lfc, mm_c5_cc)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_cc) %>% plot_enrichment_table(rank_stat, mm_c5_cc)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_cc) %>% plot_enrichment_table(rank_pval, mm_c5_cc)

GO molecular function
# NOTE(review): textually identical to the mm_c5_mf calls in the "GSEA (all)"
# section; presumably rank_* now hold DE genes only - confirm.
# Shrunken LFC
get_fgsea_res(rank_lfc, mm_c5_mf) %>% plot_enrichment_table(rank_lfc, mm_c5_mf)

# Wald stat
get_fgsea_res(rank_stat, mm_c5_mf) %>% plot_enrichment_table(rank_stat, mm_c5_mf)

# Rank: sign(LFC) * -log10(pvalue)
get_fgsea_res(rank_pval, mm_c5_mf) %>% plot_enrichment_table(rank_pval, mm_c5_mf)

System info
# Record the R version, platform and package versions used for this report
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /home/chan/mRNA_seq_pipeline/.snakemake/conda/9a19315a020c824d12f8055f7c009b0f/lib/libopenblasp-r0.3.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] fgsea_1.20.0 RColorBrewer_1.1-2 pheatmap_1.0.12 DESeq2_1.34.0 SummarizedExperiment_1.24.0 Biobase_2.54.0 MatrixGenerics_1.6.0 matrixStats_0.61.0 GenomicRanges_1.46.0 GenomeInfoDb_1.30.0 IRanges_2.28.0 S4Vectors_0.32.0 BiocGenerics_0.40.0 scales_1.1.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.1.1 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 XVector_0.34.0 fs_1.5.1 rstudioapi_0.13 farver_2.1.0 bit64_4.0.5 mvtnorm_1.1-3 AnnotationDbi_1.56.1 fansi_0.4.2 apeglm_1.16.0 lubridate_1.8.0 xml2_1.3.3 splines_4.1.0 cachem_1.0.6 geneplotter_1.72.0 knitr_1.35 jsonlite_1.7.2 broom_0.7.10 annotate_1.72.0 dbplyr_2.1.1 png_0.1-7 compiler_4.1.0 httr_1.4.2 backports_1.4.0 assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0 cli_3.1.0 htmltools_0.5.2 tools_4.1.0 coda_0.19-4 gtable_0.3.0 glue_1.5.1 GenomeInfoDbData_1.2.7 fastmatch_1.1-3 Rcpp_1.0.7 bbmle_1.0.24 cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8 Biostrings_2.62.0 xfun_0.28 rvest_1.0.2 lifecycle_1.0.1 XML_3.99-0.8 MASS_7.3-54 zlibbioc_1.40.0 vroom_1.5.7 hms_1.1.1 parallel_4.1.0 yaml_2.2.1 memoise_2.0.1 gridExtra_2.3 emdbook_1.3.12 bdsmatrix_1.3-4 stringi_1.7.6 RSQLite_2.2.8 highr_0.9 genefilter_1.76.0 BiocParallel_1.28.0 rlang_0.4.12 pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.14 lattice_0.20-45 labeling_0.4.2 bit_4.0.4 tidyselect_1.1.1 plyr_1.8.6 magrittr_2.0.1 R6_2.5.1 generics_0.1.1 DelayedArray_0.20.0 DBI_1.1.1 pillar_1.6.4 haven_2.4.3 withr_2.4.3 survival_3.2-13 KEGGREST_1.34.0 RCurl_1.98-1.5 modelr_0.1.8 crayon_1.4.2 utf8_1.2.2 tzdb_0.2.0 rmarkdown_2.11 locfit_1.5-9.4 grid_4.1.0 readxl_1.3.1 data.table_1.14.2 blob_1.2.2 reprex_2.0.1 digest_0.6.29 xtable_1.8-4 numDeriv_2016.8-1.1 munsell_0.5.0